load requirements

rm(list = ls())
{
  library(stringr)
  library(dplyr)
  library(plyr)
  library(magrittr)
  library(survival)
  library(grid)
  library(forestploter)
  library(openxlsx)
  library(pROC)
  library(rms)
  library(ggsci)
  library(RColorBrewer)
  library(timeROC)
  library(randomForest)
  library(tidyverse)
  library(ggplot2)
  library(ggthemes)
  library(ggpubr)
}

Correlation between Clinical parameters and MS

clinic <- read.table(file = './00data_preparation/ACC_clinic.tsv',sep = '\t',header=T)
variable <- c('SEX',
              'AGE_new',
              # 'RACE',
              'Ethnicity.Race',
              'Clinical.Stage',
              'T.stage',
              'Lymph.node',
              'ATYPICAL_MITOTIC_FIGURES',
              'CAPSULAR_INVASION',
              'Clinical.Status.3.Mo.Post.Op',
              'Distant.Metastasis',
              'CYTOPLASM_PRESENCE_LESS_THAN_EQUAL_25_PERCENT',
              'DIFFUSE_ARCHITECTURE',
              'Excessive.Hormone',
              'LATERALITY',
              'METASTATIC_SITE_1',
              'METASTATIC_SITE_2',
              'MITOTIC_RATE',
              'NECROSIS',
              'NUCLEAR_GRADE',
              'PHARMACEUTICAL_TX_ADJUVANT',
              'PHARM_TX_MITOTANE_ADJUVANT',
              'PHARM_TX_MITOTANE_THERAPUTIC_AT_REC',
              'RADIATION_TREATMENT_ADJUVANT',
              'Surgical.margin',
              'SINUSOID_INVASION',
              'PRIMARY_THERAPY_OUTCOME',
              'Neoplasm.Status',
              'WEISS_VENOUS_INVASION',
              'TUMOR_WEIGHT_new')
data_type <- c('Without_combined',"Likely_combined","PC_combined","Putative_combined",'Strigent_combined')

# Tables
result_list <- list()
for (j in 1:5) {
  result <- data.frame()
  for (i in 1:length(variable)) {
    name <- variable[i]
    clinic <- dplyr::mutate(clinic,MS=clinic[,data_type[j]])
    dat <- data.frame(table(clinic[,c('MS',name)])) %>% 
      ddply(.(MS),transform,percent=Freq/sum(Freq)*100) 
    colnames(dat)[1:2] <- c('Var1','Var2')
    x <- matrix(dat$Freq,ncol=2) %>% as.data.frame() %>% set_colnames(c('MS1','MS2'))
    x <- tibble::rownames_to_column(x,var = 'parameter') %>% dplyr::mutate(parameter=unique(dat$Var2))
    result <- rbind(result,x)
  }
  result_list[[j]] <- result
}

Cor_MS_and_Clin_NR <- result_list[[1]];knitr::kable(Cor_MS_and_Clin_NR)
parameter MS1 MS2
Female 34 14
Male 21 8
<60 42 18
>60 13 4
NOT WHITE 2 0
WHITE 49 16
Stage I 6 3
Stage II 24 13
Stage III 11 5
Stage IV 14 1
T1 6 3
T2 28 14
T3 6 2
T4 15 3
N0 47 21
N1 8 1
Atypical Mitotic Figures Absent 16 13
Atypical Mitotic Figures Present 26 4
Invasion of Tumor Capsule Absent 20 10
Invasion of Tumor Capsule Present 30 12
Absent 35 19
Present 16 2
M0 41 21
M1 14 1
Cytoplasm presence <= 25% Absent 7 3
Cytoplasm presence <= 25% Present 35 14
Diffuse Architecture Absent 14 5
Diffuse Architecture Present 29 12
Absent 15 11
Present 36 11
Left 32 11
Right 23 11
Lung 6 2
Not Lung 9 0
Liver 8 0
Not Liver 7 2
Mitotic Rate > 5/50 HPF Absent 19 10
Mitotic Rate > 5/50 HPF Present 30 10
Necrosis Absent 13 4
Necrosis Present 39 17
I-II 11 2
III-IV 33 15
NO 18 7
YES 36 14
NO 4 0
YES 25 14
NO 9 7
YES 2 2
NO 41 18
YES 12 3
R0 37 18
R1+R2 14 1
Sinusoid Invasion Absent 22 13
Sinusoid Invasion Present 20 4
CR 19 15
PD 15 1
TUMOR FREE 22 18
WITH TUMOR 32 4
Venous Invasion Absent 26 13
Venous Invasion Present 23 7
<median 31 9
>median 24 13
Cor_MS_and_Clin_LR <- result_list[[2]];knitr::kable(Cor_MS_and_Clin_LR)
parameter MS1 MS2
Female 32 16
Male 20 9
<60 41 19
>60 11 6
NOT WHITE 2 0
WHITE 48 17
Stage I 5 4
Stage II 23 14
Stage III 11 5
Stage IV 13 2
T1 5 4
T2 27 15
T3 6 2
T4 14 4
N0 44 24
N1 8 1
Atypical Mitotic Figures Absent 14 15
Atypical Mitotic Figures Present 25 5
Invasion of Tumor Capsule Absent 19 11
Invasion of Tumor Capsule Present 28 14
Absent 33 21
Present 15 3
M0 39 23
M1 13 2
Cytoplasm presence <= 25% Absent 6 4
Cytoplasm presence <= 25% Present 33 16
Diffuse Architecture Absent 12 7
Diffuse Architecture Present 28 13
Absent 14 12
Present 34 13
Left 30 13
Right 22 12
Lung 6 2
Not Lung 8 1
Liver 7 1
Not Liver 7 2
Mitotic Rate > 5/50 HPF Absent 17 12
Mitotic Rate > 5/50 HPF Present 29 11
Necrosis Absent 12 5
Necrosis Present 37 19
I-II 9 4
III-IV 32 16
NO 16 9
YES 35 15
NO 3 1
YES 25 14
NO 9 7
YES 2 2
NO 38 21
YES 12 3
R0 35 20
R1+R2 13 2
Sinusoid Invasion Absent 21 14
Sinusoid Invasion Present 18 6
CR 18 16
PD 14 2
TUMOR FREE 20 20
WITH TUMOR 31 5
Venous Invasion Absent 24 15
Venous Invasion Present 22 8
<median 29 11
>median 23 14
Cor_MS_and_Clin_CR <- result_list[[3]];knitr::kable(Cor_MS_and_Clin_CR)
parameter MS1 MS2
Female 33 15
Male 15 14
<60 36 24
>60 12 5
NOT WHITE 2 0
WHITE 45 20
Stage I 4 5
Stage II 20 17
Stage III 11 5
Stage IV 13 2
T1 4 5
T2 24 18
T3 6 2
T4 14 4
N0 40 28
N1 8 1
Atypical Mitotic Figures Absent 15 14
Atypical Mitotic Figures Present 23 7
Invasion of Tumor Capsule Absent 19 11
Invasion of Tumor Capsule Present 25 17
Absent 29 25
Present 15 3
M0 35 27
M1 13 2
Cytoplasm presence <= 25% Absent 7 3
Cytoplasm presence <= 25% Present 31 18
Diffuse Architecture Absent 13 6
Diffuse Architecture Present 25 16
Absent 10 16
Present 35 12
Left 27 16
Right 21 13
Lung 5 3
Not Lung 9 0
Liver 8 0
Not Liver 6 3
Mitotic Rate > 5/50 HPF Absent 17 12
Mitotic Rate > 5/50 HPF Present 28 12
Necrosis Absent 10 7
Necrosis Present 36 20
I-II 10 3
III-IV 29 19
NO 14 11
YES 33 17
NO 3 1
YES 23 16
NO 7 9
YES 2 2
NO 35 24
YES 11 4
R0 31 24
R1+R2 13 2
Sinusoid Invasion Absent 20 15
Sinusoid Invasion Present 17 7
CR 15 19
PD 15 1
TUMOR FREE 15 25
WITH TUMOR 32 4
Venous Invasion Absent 21 18
Venous Invasion Present 22 8
<median 27 13
>median 21 16
Cor_MS_and_Clin_PR <- result_list[[4]];knitr::kable(Cor_MS_and_Clin_PR)
parameter MS1 MS2
Female 32 16
Male 20 9
<60 40 20
>60 12 5
NOT WHITE 2 0
WHITE 48 17
Stage I 4 5
Stage II 22 15
Stage III 12 4
Stage IV 14 1
T1 4 5
T2 26 16
T3 6 2
T4 16 2
N0 44 24
N1 8 1
Atypical Mitotic Figures Absent 14 15
Atypical Mitotic Figures Present 24 6
Invasion of Tumor Capsule Absent 18 12
Invasion of Tumor Capsule Present 29 13
Absent 31 23
Present 17 1
M0 38 24
M1 14 1
Cytoplasm presence <= 25% Absent 7 3
Cytoplasm presence <= 25% Present 31 18
Diffuse Architecture Absent 13 6
Diffuse Architecture Present 26 15
Absent 13 13
Present 35 12
Left 30 13
Right 22 12
Lung 7 1
Not Lung 9 0
Liver 8 0
Not Liver 8 1
Mitotic Rate > 5/50 HPF Absent 17 12
Mitotic Rate > 5/50 HPF Present 29 11
Necrosis Absent 11 6
Necrosis Present 38 18
I-II 10 3
III-IV 30 18
NO 16 9
YES 35 15
NO 3 1
YES 24 15
NO 9 7
YES 2 2
NO 37 22
YES 13 2
R0 34 21
R1+R2 14 1
Sinusoid Invasion Absent 20 15
Sinusoid Invasion Present 18 6
CR 16 18
PD 16 0
TUMOR FREE 18 22
WITH TUMOR 33 3
Venous Invasion Absent 22 17
Venous Invasion Present 24 6
<median 28 12
>median 24 13
Cor_MS_and_Clin_SR <- result_list[[5]];knitr::kable(Cor_MS_and_Clin_SR)
parameter MS1 MS2
Female 23 25
Male 13 16
<60 28 32
>60 8 9
NOT WHITE 0 2
WHITE 31 34
Stage I 6 3
Stage II 14 23
Stage III 7 9
Stage IV 9 6
T1 6 3
T2 17 25
T3 3 5
T4 10 8
N0 31 37
N1 5 4
Atypical Mitotic Figures Absent 15 14
Atypical Mitotic Figures Present 15 15
Invasion of Tumor Capsule Absent 14 16
Invasion of Tumor Capsule Present 19 23
Absent 25 29
Present 10 8
M0 27 35
M1 9 6
Cytoplasm presence <= 25% Absent 6 4
Cytoplasm presence <= 25% Present 24 25
Diffuse Architecture Absent 9 10
Diffuse Architecture Present 21 20
Absent 9 17
Present 26 21
Left 19 24
Right 17 17
Lung 3 5
Not Lung 7 2
Liver 7 1
Not Liver 3 6
Mitotic Rate > 5/50 HPF Absent 8 21
Mitotic Rate > 5/50 HPF Present 24 16
Necrosis Absent 6 11
Necrosis Present 28 28
I-II 5 8
III-IV 25 23
NO 7 18
YES 28 22
NO 3 1
YES 22 17
NO 7 9
YES 3 1
NO 30 29
YES 5 10
R0 25 30
R1+R2 8 7
Sinusoid Invasion Absent 14 21
Sinusoid Invasion Present 15 9
CR 17 17
PD 10 6
TUMOR FREE 14 26
WITH TUMOR 21 15
Venous Invasion Absent 19 20
Venous Invasion Present 13 17
<median 17 23
>median 19 18
# P-value table
result <- data.frame(Parameter=variable,
                     Without_pvalue=NA,
                     Likely_pvalue=NA,
                     PC_pvalue=NA,
                     Putative_pvalue=NA,
                     Strigent_pvalue=NA)
for (i in 1:length(variable)) {
  for (j in 1:5) {
    name <- variable[i]
    clinic <- dplyr::mutate(clinic,MS=clinic[,data_type[j]])
    dat <- data.frame(table(clinic[,c('MS',name)])) %>% 
      ddply(.(MS),transform,percent=Freq/sum(Freq)*100) 
    colnames(dat)[1:2] <- c('Var1','Var2')
    x <- matrix(dat$Freq,ncol=2)
    if (sum(x<5)>0){
      a <- fisher.test(x)
    }else{
      a <- chisq.test(x)
    }
    result[i,j+1] <- round(a$p.value,2)
  }
}
result[result<0.05] <- '*'
knitr::kable(result)
Parameter Without_pvalue Likely_pvalue PC_pvalue Putative_pvalue Strigent_pvalue
SEX 1 1 0.21 1 0.98
AGE_new 0.76 1 0.61 0.99 1
Ethnicity.Race 1 1 1 1 0.5
Clinical.Stage 0.19 0.29 0.09 * 0.3
T.stage 0.63 0.63 0.25 0.07 0.42
Lymph.node 0.43 0.26 0.14 0.26 0.73
ATYPICAL_MITOTIC_FIGURES * * 0.08 * 1
CAPSULAR_INVASION 0.86 0.97 0.93 0.59 1
Clinical.Status.3.Mo.Post.Op 0.07 0.15 * * 0.68
Distant.Metastasis 0.05 0.12 * * 0.39
CYTOPLASM_PRESENCE_LESS_THAN_EQUAL_25_PERCENT 1 0.72 1 1 0.73
DIFFUSE_ARCHITECTURE 1 0.92 0.79 0.93 1
Excessive.Hormone 0.16 0.18 * 0.06 0.15
LATERALITY 0.69 0.82 1 0.82 0.78
METASTATIC_SITE_1 0.21 0.58 0.08 0.47 0.15
METASTATIC_SITE_2 0.47 1 0.21 1 0.05
MITOTIC_RATE 0.56 0.34 0.47 0.34 *
NECROSIS 0.76 0.96 0.9 1 0.43
NUCLEAR_GRADE 0.32 1 0.34 0.51 0.58
PHARMACEUTICAL_TX_ADJUVANT 1 0.79 0.55 0.79 *
PHARM_TX_MITOTANE_ADJUVANT 0.29 1 1 1 0.63
PHARM_TX_MITOTANE_THERAPUTIC_AT_REC 1 1 1 1 0.58
RADIATION_TREATMENT_ADJUVANT 0.53 0.36 0.38 0.12 0.36
Surgical.margin 0.05 0.12 * * 0.8
SINUSOID_INVASION 0.14 0.36 0.43 0.26 0.15
PRIMARY_THERAPY_OUTCOME * * * * 0.6
Neoplasm.Status * * * * 0.07
WEISS_VENOUS_INVASION 0.52 0.44 0.16 0.07 0.84
TUMOR_WEIGHT_new 0.33 0.47 0.46 0.81 0.58

Multivariate COX

result <- data.frame()
res.cox.all <- coxph(formula = Surv(OS.time,OS)~ 
                       # Without_combined +
                       # Likely_combined+
                       # PC_combined+
                       Putative_combined+
                       Clinical.Stage+
                       # Excessive.Hormone+
                       ATYPICAL_MITOTIC_FIGURES+
                       Surgical.margin+
                       Clinical.Status.3.Mo.Post.Op+
                       Distant.Metastasis+
                       # Neoplasm.Status+
                       PRIMARY_THERAPY_OUTCOME,
                       data = clinic)
x <- summary(res.cox.all)
result_cox <- data.frame(HR=signif(x$conf.int[,1],digits=2),
                         HR.confit.lower=signif(x$conf.int[,"lower .95"],2),
                         HR.confit.upper=signif(x$conf.int[,"upper .95"],2),
                         p.value =signif(x$coefficients[,"Pr(>|z|)"],digits = 3))

result_cox <- result_cox %>%
  mutate('HR(95%CI)'=paste(.$HR,"(",result_cox$HR.confit.lower,"-",.$HR.confit.upper,")",sep = "")) %>%
  dplyr::mutate(p.value=as.numeric(.$p.value)) %>%
  dplyr::mutate(p.value= ifelse(.$p.value<0.001,'***',ifelse(.$p.value<0.01,'**',ifelse(.$p.value<0.05,'*','NS'))))%>%
  tibble::rownames_to_column(var = 'Parameter')
result_cox[,2:4] <- lapply(result_cox[,2:4],as.numeric)
result <- rbind(result,result_cox)

dat <- result
dat$` ` <- paste(rep(" ", nrow(dat)), collapse = " ")
dat$Parameter <- ifelse(is.na(dat$HR), 
                       dat$Parameter,
                      paste0("   ", dat$Parameter))
dat <- dat[,c(1,7,2:6)]
tm <- forest_theme(base_size = 15,  #文本的大小
                   footnote_col = "red")
forest(dat[,c(1:2,6:7)],
       est = dat$HR,
       lower = dat$HR.confit.lower, 
       upper = dat$HR.confit.upper,
       sizes = 1,
       ci_column = 2,
       ref_line = 1,
       # arrow_lab = c("Placebo Better", "Treatment Better"),
       xlim = c(0, 100),
       ticks_at = c(1,50),
       footnote = "",
       theme = tm)

AUC of signature microbiome

micro_sig <- read.xlsx('./01clinical_analysis/source_data/Core_features_coefficients.xlsx',sheet = 2,colNames = FALSE,check.names = F)

microbiome_NR <- read.csv(file = './01clinical_analysis/source_data/Voom-SNM-ACC.csv',header = T,row.names = 1)  %>% .[,micro_sig$X2] %>% set_colnames(str_sub(colnames(.),str_locate(colnames(.),'g__')[,1],str_length(colnames(.))))%>% tibble::rownames_to_column(var = 'names')
microbiome_LR <- read.csv(file = './01clinical_analysis/source_data/Voom-SNM-Filter-Likely-ACC.csv',header = T,row.names = 1)  %>% .[,micro_sig$X2] %>% set_colnames(str_sub(colnames(.),str_locate(colnames(.),'g__')[,1],str_length(colnames(.))))%>% tibble::rownames_to_column(var = 'names')
microbiome_CR <- read.csv(file = './01clinical_analysis/source_data/Voom-SNM-Filter-Plate_Center-ACC.csv',header = T,row.names = 1)  %>% .[,micro_sig$X2] %>% set_colnames(str_sub(colnames(.),str_locate(colnames(.),'g__')[,1],str_length(colnames(.))))%>% tibble::rownames_to_column(var = 'names')
microbiome_PR <- read.csv(file = './01clinical_analysis/source_data/Voom-SNM-Filter-Putative-ACC.csv',header = T,row.names = 1)  %>% .[,micro_sig$X2] %>% set_colnames(str_sub(colnames(.),str_locate(colnames(.),'g__')[,1],str_length(colnames(.))))%>% tibble::rownames_to_column(var = 'names')

clinic.NR <- merge(clinic,microbiome_NR,by='names') 
clinic.LR <- merge(clinic,microbiome_LR,by='names') 
clinic.CR <- merge(clinic,microbiome_CR,by='names')
clinic.PR <- merge(clinic,microbiome_PR,by='names') 

# AUC curve
roc_result <- data.frame(genus=micro_sig$X1,
                         NR=NA,
                         LR=NA,
                         CR=NA,
                         PR=NA)
clinic_type <- list(clinic.NR,clinic.LR,clinic.CR,clinic.PR)
data_type <- c('NR','LR','CR','PR')

for (i in 1:4) {
  signature_combine <- psm(Surv(OS.time,OS) ~ g__Ilyobacter+g__Isosphaera+g__Singulisphaera+g__Thermopetrobacter+g__Chelativorans+g__Pseudorhodobacter+g__Wenxinia+g__Acidocella+g__Diaphorobacter+g__Vitreoscilla+g__Bacteriovorax+g__Desulfuromonas+g__Proteus+g__Terrimicrobium+g__Haloferula, 
                           data =  clinic_type[[i]], dist='lognormal') 
  clinic$signature_combine <- predict(signature_combine)[match(clinic$names,clinic_type[[i]]$names)]
  colnames(clinic)[which(colnames(clinic)=='signature_combine')] <- paste0('signature_combine_',data_type[i])
}

roc.list <- roc(OS ~ signature_combine_NR+signature_combine_LR+signature_combine_CR+signature_combine_PR, 
                smooth=T,
                data = clinic)
labels=c()#blank list
for(i in 1:4){
  x=c('15 signature combine(NR), AUC=','15 signature combine(LR), AUC=','15 signature combine(CR), AUC=','15 signature combine(PR), AUC=')
  x.1 <- paste0(x[i],round((roc.list[[i]])$auc,2))
  labels <- c(labels,x.1)
};labels
## [1] "15 signature combine(NR), AUC=0.84" "15 signature combine(LR), AUC=0.83"
## [3] "15 signature combine(CR), AUC=0.82" "15 signature combine(PR), AUC=0.83"
ggroc(roc.list,linetype = 1, size = 1.2) +
  scale_colour_manual(name = "",
                      labels =labels,
                      values=pal_npg()(4))+
  theme_bw() +
  theme(panel.grid.major = element_blank(),
        panel.border = element_rect(fill=NA,color="black", size=1.2, linetype="solid"))+
  geom_segment(aes(x = 1, xend = 0, y = 0, yend = 1),
               color="grey", linetype="dashed")

Time dependent ROC

clinic.roc <- clinic %>% 
  dplyr::mutate(OS.time= round(.$OS.time/30,2)) %>% 
  dplyr::mutate(NR=ifelse(.$Without_combined=='MS2',1,0)) %>% 
  dplyr::mutate(LR=ifelse(.$Likely_combined=='MS2',1,0)) %>% 
  dplyr::mutate(CR=ifelse(.$PC_combined=='MS2',1,0)) %>% 
  dplyr::mutate(PR=ifelse(.$Putative_combined=='MS2',1,0)) %>% 
  dplyr::mutate(stage.new=ifelse(.$Clinical.Stage=='Stage IV',3,
                                 ifelse(.$Clinical.Stage=='Stage III',2,
                                        ifelse(.$Clinical.Stage=='Stage II',1,0)))) %>% 
  dplyr::mutate(T.stage.new=gsub('T','',.$T.stage)) %>% 
  dplyr::mutate(T.stage.new=as.numeric(.$T.stage.new)-1)

res.cox.stage <- coxph(Surv(OS.time,OS)~T.stage.new,data=clinic.roc)
clinic.roc$cox.stage <- predict(res.cox.stage)
ROC.stage <- timeROC(T=clinic.roc$OS.time,delta=clinic.roc$OS,marker=clinic.roc$cox.stage,iid = T,cause=1,weighting="marginal",times=quantile(clinic.roc$OS.time,probs=seq(0.1,0.9,0.05)))
plotAUCcurve(ROC.stage,col = 'grey')

palette <- pal_npg()(6)
pval <- data.frame()
for (i in 1:4) {
  data_type=c("Without_combined","Likely_combined","PC_combined","Putative_combined")
  clinic.roc <- dplyr::mutate(clinic.roc,MS=clinic.roc[,data_type[i]])
  res.cox <- coxph(Surv(OS.time,OS)~MS+T.stage.new,
                            data=clinic.roc)
  clinic.roc$TYPE <- predict(res.cox)
  ROC <- timeROC(T=clinic.roc$OS.time,delta=clinic.roc$OS,marker=clinic.roc$TYPE,iid = T,cause=1,weighting="marginal",times=quantile(clinic.roc$OS.time,probs=seq(0.1,0.9,0.05)))
  print(plotAUCcurve(ROC,add = T,col = palette[i]))
  outcome <- compare(ROC,ROC.stage,adjusted = T,abseps = 1e-06)
  table <- as.data.frame(outcome$p_values_AUC)
  pval <- rbind(pval,table)
}
## NULL
## NULL
## NULL
## NULL
abline(v=22.07,lwd=1,lty=2,col="#BC3C29FF")#add a vertical line
abline(v=26.96,lwd=1,lty=2,col="#BC3C29FF")
abline(v=31.894,lwd=1,lty=2,col="#BC3C29FF")
abline(v=98.408,lwd=1,lty=2,col="#BC3C29FF")
legend("bottomleft",c('MS(NR)+Stage','MS(LR)+Stage','MS(CR)+Stage','MS(PR)+Stage','Stage Only'),col=c(pal_npg()(4),'grey'),lty=1,lwd=3)

Random forest

microbiome_NR <- read.csv(file = './01clinical_analysis/source_data/Voom-SNM-ACC.csv',header = T,row.names = 1)   
microbiome_LR <- read.csv(file = './01clinical_analysis/source_data/Voom-SNM-Filter-Likely-ACC.csv',header = T,row.names = 1)  
microbiome_CR <- read.csv(file = './01clinical_analysis/source_data/Voom-SNM-Filter-Plate_Center-ACC.csv',header = T,row.names = 1)  
microbiome_PR <- read.csv(file = './01clinical_analysis/source_data/Voom-SNM-Filter-Putative-ACC.csv',header = T,row.names = 1)  

variable <- c('Clinical.Stage',
              'ATYPICAL_MITOTIC_FIGURES',
              'Clinical.Status.3.Mo.Post.Op',
              'Distant.Metastasis',
              'Excessive.Hormone',
              'Surgical.margin',
              'PRIMARY_THERAPY_OUTCOME',
              'Neoplasm.Status')
result <- data.frame(parameter=variable,NR=NA,LR=NA,CR=NA,PR=NA)
for (i in 1:4) {
  dat <- list(microbiome_NR,microbiome_LR,microbiome_CR,microbiome_PR)[[i]]
  for (j in 1:length(variable)) {
    mat <-  dat  %>% dplyr::mutate(group=as.factor(clinic[,variable[j]])) %>% drop_na(group)
    set.seed(430)
    rf = randomForest(group~ .,data=mat, ntree=500, importance=TRUE, proximity=TRUE)
    res1 <- as.data.frame(rf$importance) %>% .[order(.$MeanDecreaseAccuracy,decreasing = T),]
    res2 <- as.data.frame(rf$importance) %>% .[order(.$MeanDecreaseGini,decreasing = T),]
    x <- rownames(res1)[1:10] %>% str_sub(.,str_locate(.,'g__')[,1],str_length(.))
    y <- rownames(res2)[1:10]%>% str_sub(.,str_locate(.,'g__')[,1],str_length(.))
    result[j,i+1] <- paste(intersect(x,y),collapse=";")
  }
}
knitr::kable(result)
parameter NR LR CR PR
Clinical.Stage g__Blautia;g__Devosia;g__Prevotella;g__Flavonifractor g__Blautia;g__Clostridium;g__Chlamydia;g__Erysipelothrix;g__Lachnoclostridium g__Blautia;g__Candidatus_Jettenia;g__Shinella g__Blautia;g__Flavonifractor;g__Aquitalea;g__Rhodoferax
ATYPICAL_MITOTIC_FIGURES g__Methanospirillum;g__Marinitoga g__Pseudohongiella;g__Marinitoga;g__Syntrophorhabdus;g__Arenimonas;g__Arcobacter;g__Halorhodospira g__Marinitoga;g__Arcobacter;g__Ruania;g__Idiomarina;g__Methanospirillum g__Methanospirillum;g__Marinitoga;g__Candidatus_Methanoperedens
Clinical.Status.3.Mo.Post.Op g__Raoultella;g__Proboscivirus;g__Bizionia;g__T7likevirus;g__Crocosphaera;g__Arthrobacter;g__Oceanicaulis g__Blautia;g__Bacteroides;g__Tenacibaculum;g__Spo1virus g__T4likevirus;g__Crocosphaera;g__Stigmatella g__Cylindrospermopsis;g__Cytomegalovirus;g__Blautia
Distant.Metastasis g__Microvirga;g__Candidatus_Jettenia;g__Bacteroides;g__Denitrovibrio;g__Cosenzaea g__Shewanella;g__Bacteroides;g__Spo1virus;g__Citrobacter g__Candidatus_Jettenia;g__Microvirga;g__Spo1virus;g__Bacteroides;g__Neochlamydia;g__Oscillatoria g__Spo1virus;g__Halorhodospira;g__Salinispora;g__Pseudoclavibacter;g__Sodalis
Excessive.Hormone g__Spiroplasma;g__Marichromatium;g__Entomoplasma;g__Pantoea;g__Leucobacter g__Algiphilus;g__Streptacidiphilus;g__Marichromatium;g__Oceanobacillus;g__Caldisericum g__Moraxella;g__Marichromatium;g__Spiroplasma g__Marichromatium;g__Streptacidiphilus;g__Spiroplasma;g__Pantoea;g__Tannerella
Surgical.margin g__Sodalis;g__Blattabacterium;g__Proboscivirus;g__Ascovirus;g__Plesiomonas g__Mamastrovirus;g__Geobacillus;g__Blautia;g__Thiorhodovibrio g__Methanosarcina;g__Blattabacterium;g__Proboscivirus;g__Methanocella;g__Sodalis;g__Geobacillus g__Proboscivirus;g__Sodalis
PRIMARY_THERAPY_OUTCOME g__Syntrophus;g__Azospira;g__Proboscivirus g__Jonesia;g__Agrobacterium;g__Diplosphaera;g__Proboscivirus;g__Hydrogenophaga g__Rathayibacter;g__Proboscivirus;g__Hydrogenophaga;g__Spiribacter;g__Diplosphaera g__Parabacteroides;g__Caedibacter;g__Proboscivirus
Neoplasm.Status g__Fimbriimonas;g__Avibacterium;g__Agromyces;g__Pedobacter;g__Leptospira;g__Moraxella g__Desulfobulbus;g__Spo1virus;g__Leptospira;g__Fimbriimonas;g__Psychrilyobacter g__Fimbriimonas;g__Avibacterium;g__Caldimonas;g__Alphabaculovirus;g__Spo1virus g__Caldimonas;g__Spiroplasma;g__Blastomonas;g__Parabacteroides

MS & PHENOTYPE

clinic$mRNA_K4_new <- ifelse(str_detect(clinic$mRNA_K4,'proliferation'),NA,clinic$mRNA_K4)
pheno <- c('Histology','C1A_C1B','mRNA_K4_new','miRNA_cluster','MethyLevel','SCNA_cluster','protein_cluster','COC')
thm <- theme_bw()+
  theme(panel.border = element_rect(),
        panel.grid.major = element_blank(),   
        panel.grid.minor = element_blank(),
        panel.background = element_blank(),
        axis.line = element_line(colour = "black",size=0.5),
        text = element_text(size = 20),
        axis.text.x = element_text(size=25,color ='black'),
        axis.text.y = element_text(size = 20,color = 'black'),
        axis.title.y = element_text(size = 20),
        plot.subtitle  = element_text(face = 'italic',hjust = 0.5),
        legend.position='bottom',
        legend.text = element_text(size = 15,hjust = 0),
        legend.title = element_text(size = 20)
  )

# plot for phenos
data_type <- c("Without_combined","Likely_combined","PC_combined","Putative_combined")
for (j in 1:4) {
  clinic$MS <- clinic[,data_type[j]]
  for (i in 1:length(pheno)) {
    dat <- data.frame(table(clinic[,c('MS',pheno[i])])) %>% 
      ddply(.(MS),transform,percent=Freq/sum(Freq)*100) 
    colnames(dat)[1:2] <- c('Var1','Var2')
    x <- matrix(dat$Freq,ncol=2)
    if (sum(x<5)>0){
      pvalue <- fisher.test(x)$p.value
      # print(pvalue)
    }else{
      pvalue <- chisq.test(x)$p.value
      # print(pvalue)
    }
    legend_n <- length(unique(dat$Var2))
    p <- dat%>%
      drop_na() %>% 
      ggplot(aes(fill=Var2,x=Var1,y=Freq))+
      geom_bar(position = 'fill',stat = 'identity',width = 0.8)+
      guides(fill=guide_legend(title=pheno[i],direction = "horizontal",
                               nrow=ifelse(legend_n>4,3,ifelse(legend_n>1,2,1)))) +
      scale_fill_manual(values = c(pal_nejm(alpha = 0.9)(3)[2:3],pal_nejm(alpha = 0.9)(1),pal_nejm(alpha = 0.9)(8)[4:8]))+
      scale_y_continuous(labels = scales::percent)+
      coord_fixed(ratio = 5/2)+
      labs(x='',y='Proportion %',title = data_type[j],
           subtitle = paste0("P ", ifelse(pvalue<0.001, "< 0.001", paste0("= ",round(pvalue,3)))))+
      thm
    print(p)
    # dev.off()
  }
}

# MS & ADS
for (i in 1:4) {
  clinic$MS <- clinic[,data_type[i]]
  p <- ggplot(data = clinic,aes(x=MS, y=ADS)) + 
    geom_violin(trim=T) +
    geom_jitter(shape=16, color="grey",size=2.0,position=position_jitter(0.2))+
    geom_boxplot(width=0.1,position=position_dodge(0.8),aes(fill=MS))+
    thm+
    theme(legend.position = 'right')+
    scale_fill_manual(values = c(MS1="#0072B5CC",MS2="#E18727CC"))+
    guides(fill=guide_legend(direction = "horizontal",ncol =1))+
    labs(x="", y = "ADS ", fill = "",title = data_type[i])+  
    stat_compare_means(aes(x = MS , y = ADS),label = "p.format",size=6)
  print(p)
}